{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentence tokenization\n",
    "\n",
    "In this recipe, we will learn how to count the number of sentences in a piece of text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from nltk.tokenize import sent_tokenize\n",
    "from sklearn.datasets import fetch_20newsgroups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"\n",
    "The alarm rang at 7 in the morning as it usually did on Tuesdays. She rolled over,\n",
    "stretched her arm, and stumbled to the button till she finally managed to switch it off.\n",
    "Reluctantly, she got up and went for a shower. The water was cold as the day before the engineers\n",
    "did not manage to get the boiler working. Good thing it was still summer.\n",
    "Upstairs, her cat waited eagerly for his morning snack. Miaow! He voiced with excitement\n",
    "as he saw her climb the stairs.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\nThe alarm rang at 7 in the morning as it usually did on Tuesdays.',\n",
       " 'She rolled over,\\nstretched her arm, and stumbled to the button till she finally managed to switch it off.',\n",
       " 'Reluctantly, she got up and went for a shower.',\n",
       " 'The water was cold as the day before the engineers\\ndid not manage to get the boiler working.',\n",
       " 'Good thing it was still summer.',\n",
       " 'Upstairs, her cat waited eagerly for his morning snack.',\n",
       " 'Miaow!',\n",
       " 'He voiced with excitement\\nas he saw her climb the stairs.']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# separate text into sentences\n",
    "\n",
    "sent_tokenize(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count number of sentences\n",
    "\n",
    "len(sent_tokenize(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>From: lerxst@wam.umd.edu (where's my thing)\\nS...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>From: guykuo@carson.u.washington.edu (Guy Kuo)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>From: twillis@ec.ecn.purdue.edu (Thomas E Will...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>From: jgreen@amber (Joe Green)\\nSubject: Re: W...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>From: jcm@head-cfa.harvard.edu (Jonathan McDow...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text\n",
       "0  From: lerxst@wam.umd.edu (where's my thing)\\nS...\n",
       "1  From: guykuo@carson.u.washington.edu (Guy Kuo)...\n",
       "2  From: twillis@ec.ecn.purdue.edu (Thomas E Will...\n",
       "3  From: jgreen@amber (Joe Green)\\nSubject: Re: W...\n",
       "4  From: jcm@head-cfa.harvard.edu (Jonathan McDow..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# now we do the same for an entire dataframe\n",
    "\n",
    "# load data\n",
    "data = fetch_20newsgroups(subset='train')\n",
    "df = pd.DataFrame(data.data, columns=['text'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>From: guykuo@carson.u.washington.edu (Guy Kuo)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>From: twillis@ec.ecn.purdue.edu (Thomas E Will...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>From: jgreen@amber (Joe Green)\\nSubject: Re: W...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>From: jcm@head-cfa.harvard.edu (Jonathan McDow...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\\...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>From: bmdelane@quads.uchicago.edu (brian manni...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>From: bgrubb@dante.nmsu.edu (GRUBB)\\nSubject: ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>From: holmes7000@iscsvax.uni.edu\\nSubject: WIn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\\nSubje...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>From: irwin@cmptrc.lonestar.org (Irwin Arnstei...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 text\n",
       "1   From: guykuo@carson.u.washington.edu (Guy Kuo)...\n",
       "2   From: twillis@ec.ecn.purdue.edu (Thomas E Will...\n",
       "3   From: jgreen@amber (Joe Green)\\nSubject: Re: W...\n",
       "4   From: jcm@head-cfa.harvard.edu (Jonathan McDow...\n",
       "5   From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\\...\n",
       "6   From: bmdelane@quads.uchicago.edu (brian manni...\n",
       "7   From: bgrubb@dante.nmsu.edu (GRUBB)\\nSubject: ...\n",
       "8   From: holmes7000@iscsvax.uni.edu\\nSubject: WIn...\n",
       "9   From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\\nSubje...\n",
       "10  From: irwin@cmptrc.lonestar.org (Irwin Arnstei..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# take the first 10 rows to speed things up\n",
    "\n",
    "df = df.loc[1:10]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11\\nNNTP-Posting-Host: carson.u.washington.ed...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>36\\n\\nwell folks, my mac plus finally gave up...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14\\nDistribution: world\\nNNTP-Posting-Host: a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>23\\n\\nFrom article &lt;C5owCB.n3p@world.std.com&gt;...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>58\\n\\nIn article &lt;1r1eu1$4t@transfer.stratus....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>12\\n\\nThere were a few people who responded t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>44\\nDistribution: world\\nNNTP-Posting-Host: d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10\\n\\nI have win 3.0 and downloaded several i...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>29\\n\\njap10@po.CWRU.Edu (Joseph A. Pellettier...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>13\\n\\nI have a line on a Ducati 900GTS 1978 m...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 text\n",
       "1    11\\nNNTP-Posting-Host: carson.u.washington.ed...\n",
       "2    36\\n\\nwell folks, my mac plus finally gave up...\n",
       "3    14\\nDistribution: world\\nNNTP-Posting-Host: a...\n",
       "4    23\\n\\nFrom article <C5owCB.n3p@world.std.com>...\n",
       "5    58\\n\\nIn article <1r1eu1$4t@transfer.stratus....\n",
       "6    12\\n\\nThere were a few people who responded t...\n",
       "7    44\\nDistribution: world\\nNNTP-Posting-Host: d...\n",
       "8    10\\n\\nI have win 3.0 and downloaded several i...\n",
       "9    29\\n\\njap10@po.CWRU.Edu (Joseph A. Pellettier...\n",
       "10   13\\n\\nI have a line on a Ducati 900GTS 1978 m..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove first part of email\n",
    "\n",
    "df['text'] = df['text'].str.split('Lines:').apply(lambda x: x[1])\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 11\n",
      "NNTP-Posting-Host: carson.u.washington.edu\n",
      "\n",
      "A fair number of brave souls who upgraded their SI clock oscillator have\n",
      "shared their experiences for this poll. Please send a brief message detailing\n",
      "your experiences with the procedure. Top speed attained, CPU rated speed,\n",
      "add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
      "functionality with 800 and 1.4 m floppies are especially requested.\n",
      "\n",
      "I will be summarizing in the next two days, so please add to the network\n",
      "knowledge base if you have done the clock upgrade and haven't answered this\n",
      "poll. Thanks.\n",
      "\n",
      "Guy Kuo <guykuo@u.washington.edu>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(df['text'][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>num_sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11\\nNNTP-Posting-Host: carson.u.washington.ed...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>36\\n\\nwell folks, my mac plus finally gave up...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14\\nDistribution: world\\nNNTP-Posting-Host: a...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>23\\n\\nFrom article &lt;C5owCB.n3p@world.std.com&gt;...</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>58\\n\\nIn article &lt;1r1eu1$4t@transfer.stratus....</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>12\\n\\nThere were a few people who responded t...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>44\\nDistribution: world\\nNNTP-Posting-Host: d...</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10\\n\\nI have win 3.0 and downloaded several i...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>29\\n\\njap10@po.CWRU.Edu (Joseph A. Pellettier...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>13\\n\\nI have a line on a Ducati 900GTS 1978 m...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 text  num_sent\n",
       "1    11\\nNNTP-Posting-Host: carson.u.washington.ed...         6\n",
       "2    36\\n\\nwell folks, my mac plus finally gave up...         9\n",
       "3    14\\nDistribution: world\\nNNTP-Posting-Host: a...         7\n",
       "4    23\\n\\nFrom article <C5owCB.n3p@world.std.com>...        10\n",
       "5    58\\n\\nIn article <1r1eu1$4t@transfer.stratus....        21\n",
       "6    12\\n\\nThere were a few people who responded t...         8\n",
       "7    44\\nDistribution: world\\nNNTP-Posting-Host: d...        15\n",
       "8    10\\n\\nI have win 3.0 and downloaded several i...         3\n",
       "9    29\\n\\njap10@po.CWRU.Edu (Joseph A. Pellettier...        12\n",
       "10   13\\n\\nI have a line on a Ducati 900GTS 1978 m...        11"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['num_sent'] = df['text'].apply(sent_tokenize).apply(len)\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fsml",
   "language": "python",
   "name": "fsml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
